Get Data

mergebinpath <- paste(parent_folder,"e11e12FHmerged", sep = "/" )
merged_bins.dt <- data.table(fread(mergebinpath))
mergemelt.dt <- melt(merged_bins.dt, id.var=c('read_id', 'total_count', 'bc_read'), value.name='Count', variable.name='Sample')
mergemelt.dt[, SampleWell := str_replace_all(Sample, "_cutcount", "")]
rm(merged_bins.dt) #remove original

Get Metadata

#sample metadata 
liblist_path <- paste(parent_folder,"I_sample_metadata.csv", sep = "/" )
liblist_dt <- data.table(fread(liblist_path))
names(liblist_dt)[3] <- 'lib_desc'

#donor sequence metadata
allelelist_path <- paste(parent_folder,"donor_list_fig5.csv", sep = "/" )
seq_annot_dt <- data.table(fread(allelelist_path))
seq_annot_dt[, bc_read := prodonor_seq]
names(seq_annot_dt) <- gsub('_split','', names(seq_annot_dt))

#essential gene metadata
essentials_path <- paste(parent_folder,"essentials.csv", sep = "/" )
essentials_dt <- data.table(fread(essentials_path))

Merge Metadata

#merge sample and seq metadata to big table
annotated_megatable <- liblist_dt[mergemelt.dt, on = "SampleWell", nomatch = 0]
seq_annotated_megatable <- annotated_megatable[seq_annot_dt, on = "bc_read", nomatch = 0]

rows_before_merge = nrow(mergemelt.dt)
rows_after_merge = nrow(seq_annotated_megatable)
pct_rows = rows_after_merge / rows_before_merge
counts_before_merge = sum(mergemelt.dt$Count)
counts_after_merge = sum(seq_annotated_megatable$Count)
pct_counts = counts_after_merge / counts_before_merge

ggplot(
  seq_annot_dt[, list(bc_read, annot_seq=T)][
    annotated_megatable, on = "bc_read"][is.na(annot_seq), annot_seq :=F]) + 
  geom_density(aes(x=Count, color=annot_seq)) + 
  scale_x_log10() + 
  scale_color_discrete('Is Sequence Annotated') +
  labs(
    x='Log10 Read counts per Seq',
    y='Seq Probability Density',
    title='Frequency of Reads vs Annotation') +
  theme_minimal()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 4997789 rows containing non-finite values (stat_density).

rm(mergemelt.dt) #remove original, large file
rm(annotated_megatable) #remove merged, large file


#print statements about merge efficiency
print(paste(as.character(rows_before_merge), " rows annotated of ",
            as.character(rows_after_merge), " total rows, or ",
            as.character(pct_rows*100), "%", sep = ""))
## [1] "7623484 rows annotated of 90750 total rows, or 1.19040060948511%"
print(paste(as.character(counts_before_merge), " counts annotated of ",
            as.character(counts_after_merge), " total counts, or ",
            as.character(pct_counts*100), "%", sep = ""))
## [1] "170758961 counts annotated of 117188649 total counts, or 68.6281108257622%"

Initial Counts

ggplot(seq_annotated_megatable[SampleWell %in% c('E11','E12')]) +
  geom_histogram(aes(x=log10(Count), fill=SampleWell), position='dodge') + 
  facet_wrap(subpool~.)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1469 rows containing non-finite values (stat_bin).

Calculate Metrics

#calculate frequency for each prodonor, by replicate and timepoint
seq_annotated_megatable[, total_sample_counts := sum(Count), by='SampleWell']

ggplot(
    seq_annotated_megatable[!is.na(vial), list(total_sample_counts= sum(Count)),
        by=c('SampleWell','time_hours','vial')], 
    aes(x=factor(vial), y=factor(time_hours))) +
  geom_tile(aes(fill=log10(total_sample_counts))) + 
  geom_text(aes(label=SampleWell), color='white')

seq_annotated_megatable[Count == 0, Count := 1] #insert pseudocount of 1 AFTER taking sum, and BEFORE determining frequency

seq_annotated_megatable[, seq_freq := Count/total_sample_counts]

#split into 2xstops and recode sets
stops_dt = seq_annotated_megatable[(vial == 4 | vial == 5) & (desc %in% c('0','1'))]
recode_dt = seq_annotated_megatable[(vial == 6 | vial == 7) & !(SampleWell %in% 'F3')]

#for stops_dt, normalize to _R controls mean
stops_dt[, mean_well := mean(seq_freq[is_R == "R"]), by='SampleWell']
stops_dt[, ratio_well := seq_freq/mean_well]

stops_dt[, mean_donor := mean(seq_freq[is_R == "R"]), 
  by=c('gene', 'desc', 'vial', 'time_hours')]
stops_dt[, ratio_donor := seq_freq/mean_donor]

stops_dt[, mean_R_gene := mean(seq_freq[is_R == "R"]), 
  by=c('gene', 'vial', 'time_hours')]
stops_dt[, ratio_R_gene := seq_freq/mean_R_gene] 

stops_dt <- melt(stops_dt[is_R==''], 
  measure.vars=grep('ratio_',names(stops_dt), value=T), variable.name='ratio')

stops_dt[, value_norm := value/value[time_hours==0], 
  by=c('gene', 'desc', 'vial','ratio')]
first_20_genes = head(unique(stops_dt$gene),20)

ggplot(stops_dt[gene %in% first_20_genes], 
    aes(x=time_hours, y=value_norm, color = factor(desc), group=interaction(vial, desc))) + 
  geom_line() +
  geom_point() +
  scale_y_continuous(trans = log10_trans()) + 
  theme_bw() +
  facet_grid(ratio~gene)
## Warning: Removed 20 rows containing missing values (geom_point).

Linear Coefficients

stops_dt[time_hours < 20 & !is.na(value_norm), lm_coeff := lm(log10(value_norm) ~ time_hours)$coefficients[2], by=c('gene','desc','ratio','vial')]

ggplot(stops_dt[gene %in% first_20_genes & time_hours < 20], 
    aes(x=time_hours, y=value_norm, color = factor(desc), group=interaction(vial, desc))) + 
  geom_line() +
  geom_abline(aes(intercept=0, slope=lm_coeff, color = factor(desc)), linetype=2) + 
  geom_point() +
  scale_y_continuous(trans = log10_trans()) + 
  theme_bw() +
  facet_grid(ratio~gene)
## Warning: Removed 12 rows containing missing values (geom_abline).
## Warning: Removed 12 rows containing missing values (geom_point).

stops_essentials_dt <- merge(
  essentials_dt, stops_dt, by.x='Gene', by.y='gene', all=T)

ggplot(
    unique(stops_essentials_dt[!is.na(Gene) & time_hours < 20 & !is.na(`E/N`), 
    list(lm_coeff, `E/N`, Gene, desc, ratio, vial)])) + 
  geom_density(aes(x=lm_coeff, color=`E/N`)) + facet_wrap(desc~ratio)
## Warning: Removed 86 rows containing non-finite values (stat_density).

Incorporate PDT data from Cameron et al

pdt_dt <- fread(paste(parent_folder,"cameron_et_al.supp_tab_2.csv", sep = "/" ))
pdt_dt <- pdt_dt[stops_essentials_dt, on='Gene']

ggplot(pdt_dt) + 
  geom_point(aes(x=rel_growth, y=log10(rel_cfu))) +
  labs(x='Rel Growth (Cameron et al.)', y='Rel CFU (Cameron et al.)')
## Warning: Removed 41239 rows containing missing values (geom_point).

ggplot(pdt_dt) + 
  geom_point(aes(x=rel_growth, y=lm_coeff)) +
  labs(x='Rel Growth (Cameron et al.)', y='RLR LM Coefficient (<20 hrs)')
## Warning: Removed 45325 rows containing missing values (geom_point).

ggplot(pdt_dt[, 
    list(
      mean_lm= mean(lm_coeff, na.rm=T), 
      sd_lm=sd(lm_coeff, na.rm=T), 
      rel_growth, rel_cfu), 
    by=c('Gene')]) + 
  geom_point(aes(x=rel_growth, y=mean_lm)) +
  geom_linerange(aes(x=rel_growth, y=mean_lm, ymin=mean_lm-sd_lm, ymax=mean_lm+sd_lm)) +
  labs(x='Rel Growth (Cameron et al.)', y='RLR LM Coefficient (<20 hrs)')
## Warning: Ignoring unknown aesthetics: y
## Warning: Removed 41293 rows containing missing values (geom_point).
## Warning: Removed 41293 rows containing missing values (geom_linerange).

ggplot(pdt_dt[, 
    list(
      mean_lm= mean(lm_coeff, na.rm=T), 
      sd_lm=sd(lm_coeff, na.rm=T), 
      rel_growth, rel_cfu), 
    by=c('Gene')]) + 
  geom_point(aes(x=log10(rel_cfu), y=mean_lm)) +
  geom_linerange(aes(x=log10(rel_cfu), y=mean_lm, ymin=mean_lm-sd_lm, ymax=mean_lm+sd_lm)) +
  labs(x='log10 Rel CFU (Cameron et al.)', y='RLR LM Coefficient (<20 hrs)')
## Warning: Ignoring unknown aesthetics: y
## Warning: Removed 41293 rows containing missing values (geom_point).
## Warning: Removed 41293 rows containing missing values (geom_linerange).